import re
from simhash import Simhash
def get_features(s, width=3):
    """Return the overlapping character n-grams of *s* after normalisation.

    The text is lower-cased and every run of non-word characters (anything
    not matched by ``\\w``, e.g. whitespace and punctuation) is removed, so
    differences in spacing, punctuation and letter case do not affect the
    resulting features.  A window of ``width`` characters is then slid over
    the cleaned text one character at a time.

    Args:
        s: The input text.
        width: Number of characters per feature.  Defaults to 3.

    Returns:
        A list of substrings in order of appearance.  When the cleaned text
        is shorter than ``width`` (including empty), the list holds exactly
        one element: the whole cleaned text.

    Raises:
        ValueError: If ``width`` is less than 1.
    """
    if width < 1:
        # A non-positive window would only produce empty-string features.
        raise ValueError('width must be a positive integer')
    cleaned = re.sub(r'[^\w]+', '', s.lower())
    # max(..., 1) guarantees at least one window, so short inputs still
    # contribute a single (shorter) feature instead of none.
    window_count = max(len(cleaned) - width + 1, 1)
    return [cleaned[start:start + width] for start in range(window_count)]

# Print the fingerprint (as lowercase hex) of three sentences that differ
# only in small wording and spacing details.
for sentence in (
    'How are you? I am fine. Thanks.',
    'How are u? I am fine.     Thanks.',
    'How r you?I    am fine. Thanks.',
):
    fingerprint = Simhash(get_features(sentence))
    print('%x' % fingerprint.value)

print('-------------------')
# Sample Chinese titles; s4 is a fragment that also occurs inside s3.
s1 = '中国生态经济学会第五届会员代表大会暨全国生态经济建设研讨会在南昌召开'
s2 = '斑蝥素诱导人肺癌A549细胞凋亡及其分子机制的研究'
s3 = '肋骨部分切除肋膈角闭合术在肝脏肿瘤高强度聚焦超声治疗中的应用'
s4 = '肝脏肿瘤高强度聚焦超声治疗'

# Each row is (left label, right label, title compared against s2).
# The raw strings are passed to Simhash directly here, without get_features.
for left, right, other in (
    (s1, s2, s1),
    (s2, s3, s3),
    (s2, s4, s4),
):
    hamming = Simhash(s2).distance(Simhash(other))
    # The label text reads "<left> -- and -- <right>: Hamming distance is".
    print(left + '--与--' + right + '的海明距离为：' + str(hamming))

# Sanity check: distance between fingerprints built from identical input.
print(Simhash('aa').distance(Simhash('aa')))
